I have a typical project of predicting NYC Uber/Lyft trip demand. The dataset is available from January 2022 to March 2023. The area is already divided into different locations, and I want the predicted demand for each location every 15 minutes.
The goal of this project is to predict the demand for Uber/Lyft trips in different locations of NYC every 15 minutes, using a dataset spanning from January 2022 to March 2023. The dataset includes information such as the dispatching base number, pickup datetime, drop-off datetime, pickup location ID, drop-off location ID, SR_Flag, and affiliated base number.
import pandas as pd # Import the pandas library to work with data in a tabular format,
import glob # glob library to retrieve file paths using patterns
import tqdm # library to display progress bars
import plotly.graph_objects as go # library to create interactive plots
from statsmodels.tsa.arima.model import ARIMA # class from statsmodels.tsa.arima.model to perform ARIMA modeling
from dateutil.relativedelta import relativedelta # from dateutil.relativedelta to manipulate dates, numpy for numerical operations
import numpy as np # for numerical operations
from pmdarima import auto_arima # auto_arima from pmdarima library to automatically select the best ARIMA model
# Summary :
# Overall, this code imports the necessary libraries
# for time series analysis, including ARIMA modeling,
# and utilizes the auto_arima function to automatically
# select the best ARIMA model based on the data provided.
# Uses the glob.glob function to retrieve a list of file paths that match the specified
# pattern 'Datasets/fhv_tripdata_2022-2023_in_csv/*.csv'.
# This pattern is used to find all CSV files in the given directory.
# Collect every monthly FHV trip CSV exported for Jan 2022 - Mar 2023.
data_list_path = glob.glob('Datasets/fhv_tripdata_2022-2023_in_csv/*.csv')
# Fail fast with a clear message if the directory is empty or mislocated
# (pd.concat on an empty list would otherwise raise an opaque ValueError).
if not data_list_path:
    raise FileNotFoundError(
        'No CSV files found under Datasets/fhv_tripdata_2022-2023_in_csv/')
# Only these two columns are needed downstream: the pickup timestamp
# (time-series axis) and the pickup location ID (one series per zone).
interested_features = ['pickup_datetime', 'PUlocationID']
# Read each monthly file, keeping only the needed columns up front —
# usecols avoids loading every column of a ~17M-row table into memory.
list_df = []
for path in data_list_path:
    print(path)
    list_df.append(pd.read_csv(path, usecols=interested_features))
# Stack all months into one DataFrame. ignore_index gives a clean 0..n-1
# index instead of repeating each monthly file's own row numbers.
df = pd.concat(list_df, ignore_index=True)
# Reselect to pin the column order regardless of each file's layout.
df = df[interested_features]
# Summary:
# Reads all monthly CSVs from the dataset directory, keeps only the pickup
# timestamp and pickup-location columns, and concatenates them into one
# DataFrame `df` used by the rest of the notebook.
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-09.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-02.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-04.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-07.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-01.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-06.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-08.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-03.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-11.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-12.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-02.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-03.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-01.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-05.csv Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-10.csv
# The code imports the necessary libraries:
import pandas as pd
import pmdarima as pm
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
# Measure the combined trip table before and after dropping incomplete rows,
# so the amount of data lost to missing values is visible in the log.
rows_before = df.shape[0]
# dropna() discards any record missing a pickup timestamp or a pickup
# location ID; only fully-populated rows feed the time-series build.
removed_nan_df = df.dropna()
rows_after = removed_nan_df.shape[0]
print('Number of Rows Before Removing NaN:', rows_before)
print('Number of Rows After Removing NaN:', rows_after)
Number of Rows Before Removing NaN: 17712727 Number of Rows After Removing NaN: 4164902
from prophet import Prophet
import os

# Drop rows missing either the pickup timestamp or the pickup zone;
# Prophet cannot fit on NaN timestamps, and a NaN zone cannot be grouped.
print('Number of Rows Before Removing NaN:', df.shape[0])
removed_nan_df = df.dropna()
print('Number of Rows After Removing NaN:', removed_nan_df.shape[0])

# Ensure the output directory exists before the per-zone CSVs are written
# (to_csv does not create parent directories and would raise otherwise).
os.makedirs('prophet-results', exist_ok=True)

# Fit one Prophet model per pickup zone and forecast the held-out tail.
location_ids = removed_nan_df['PUlocationID'].unique().tolist()
for lc_id in location_ids:
    print('Location ID:', lc_id)
    # .copy() so the pd.to_datetime assignment below mutates an owned frame,
    # not a view of removed_nan_df (avoids pandas' SettingWithCopyWarning).
    df_subset = removed_nan_df[removed_nan_df['PUlocationID'] == lc_id].copy()
    df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime'])
    df_subset = df_subset.sort_values('pickup_datetime')
    df_subset = df_subset.set_index('pickup_datetime')
    # Demand = trip count per hourly bucket.
    # NOTE(review): the project goal is 15-minute demand; switch '1H' -> '15T'
    # here (and freq='H' -> '15T' below) once the hourly prototype is validated.
    df_subset = df_subset['PUlocationID'].resample('1H').count()
    df_subset = df_subset.reset_index()

    # Chronological 95/5 split — no shuffling, this is a time series.
    train_size = int(len(df_subset) * 0.95)
    train_data = df_subset[:train_size]
    test_data = df_subset[train_size:]

    # Prophet expects columns named 'ds' (timestamp) and 'y' (target).
    prophet_train_data = train_data.rename(columns={'pickup_datetime': 'ds', 'PUlocationID': 'y'})

    # Additive seasonality with daily and weekly cycles; yearly is disabled
    # because the dataset spans barely 15 months (not enough to learn it).
    model = Prophet(
        seasonality_mode='additive',
        daily_seasonality=True,    # Enable daily seasonality
        weekly_seasonality=True,   # Enable weekly seasonality
        yearly_seasonality=False,  # Disable yearly seasonality
    )
    model.fit(prophet_train_data)

    # Forecast exactly the test horizon at the same (hourly) frequency,
    # then keep only the out-of-sample portion of the prediction.
    future_dates = model.make_future_dataframe(periods=len(test_data), freq='H')
    forecast = model.predict(future_dates)
    forecast = forecast[['ds', 'yhat']][-len(test_data):]

    # Persist actual vs. predicted counts for offline evaluation.
    result_df = pd.DataFrame({
        'Actual': test_data['PUlocationID'].values,
        'Prediction': forecast['yhat'].values
    })
    result_df.to_csv(f'prophet-results/{lc_id}.csv', index=False)

    # Visual sanity check: training history, held-out actuals, and forecast.
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=prophet_train_data['ds'], y=prophet_train_data['y'], mode='lines+markers', name='Training Data'))
    fig.add_trace(go.Scatter(x=test_data['pickup_datetime'], y=test_data['PUlocationID'], mode='lines+markers', name='Testing Data'))
    fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], mode='lines+markers', name='Prophet Forecast'))
    fig.update_layout(title=f'PickLocation ID: {lc_id} - Facebook Prophet', xaxis_title='Time', yaxis_title='Number Drives')
    fig.show()
    # Prototype limiter: only the first location is processed;
    # remove this break to run the full set of zones.
    break
/home/iffi/anaconda3/envs/sep_darts_2/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Number of Rows Before Removing NaN: 17712727 Number of Rows After Removing NaN: 4164902 Location ID: 12.0
/tmp/ipykernel_10821/1487895832.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime']) 20:10:14 - cmdstanpy - INFO - Chain [1] start processing 20:10:18 - cmdstanpy - INFO - Chain [1] done processing